Load necessary packages
library(caret) # Classification and Regression Training
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.2
library(corrplot)
## corrplot 0.84 loaded
library(dplyr) # A Grammar of Data Manipulations
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(forcats) # Tools for Working with Categorical Variables (Factors)
## Warning: package 'forcats' was built under R version 3.5.2
library(gdata) # Various R Programming Tools for Data Manipulation
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(ggplot2) # Create Elegant Data Visualizations Using the Grammar of Graphics
library(ggrepel) # Automatically Position Non-Overlapping Text Labels with ‘ggplot2’
## Warning: package 'ggrepel' was built under R version 3.5.2
library(ggridges) # Ridgeline Plots in ‘ggplot2’
##
## Attaching package: 'ggridges'
## The following object is masked from 'package:ggplot2':
##
## scale_discrete_manual
library(ggthemes) # Extra Themes, Scales, and Geoms for ‘ggplot2’
## Warning: package 'ggthemes' was built under R version 3.5.2
library(gmodels) # Various R Programming Tools for Model Fitting
library(grid) # The Grid Graphics Package
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:gdata':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(knitr) # A General-Purpose Package for Dynamic Report Generation in R
## Warning: package 'knitr' was built under R version 3.5.2
library(lmtest) # Testing Linear Regression Models
## Warning: package 'lmtest' was built under R version 3.5.2
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(plyr) # Tools for Splitting, Applying and Combining Data
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(purrr) # Functional Programming Tools
## Warning: package 'purrr' was built under R version 3.5.2
##
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
##
## compact
## The following object is masked from 'package:gdata':
##
## keep
## The following object is masked from 'package:caret':
##
## lift
library(readr) # Read Rectangular Text Data
library(rpart)
## Warning: package 'rpart' was built under R version 3.5.2
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(scales) # Scale Functions for Visualization
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
## The following object is masked from 'package:purrr':
##
## discard
library(shiny) # Web Application Framework for R
## Warning: package 'shiny' was built under R version 3.5.2
library(skimr) # Compact and Flexible Summaries of Data
## Warning: package 'skimr' was built under R version 3.5.2
##
## Attaching package: 'skimr'
## The following object is masked from 'package:knitr':
##
## kable
## The following object is masked from 'package:stats':
##
## filter
library(stringr) # Simple, Consistent Wrappers for Common String Operations
library(tibble) # Simple Data Frames
## Warning: package 'tibble' was built under R version 3.5.2
library(tidyr) # Easily Tidy Data with ‘spread()’ and ‘gather()’ Functions
## Warning: package 'tidyr' was built under R version 3.5.2
library(tidyverse) # Install and Load ‘Tidyverse’ Packages
library(usmap)
## Warning: package 'usmap' was built under R version 3.5.2
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.5.2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
## The following object is masked from 'package:plyr':
##
## ozone
library(mapdata)
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.5.2
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggmap':
##
## theme_nothing
## The following object is masked from 'package:ggthemes':
##
## theme_map
library("googleway")
library("ggspatial")
library("rnaturalearth")
library("rnaturalearthdata")
theme_set(theme_bw())
library("sf")
## Warning: package 'sf' was built under R version 3.5.2
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
library(gridExtra)
titletheme_small <- theme(plot.title = element_text(size=9,vjust=0.1))
titletheme <- theme(plot.title = element_text(size=12,vjust=0.1))
library("viridis")
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
world <- ne_countries(scale = "medium", returnclass = "sf") #for mapping
#install.packages("tableone")
library("tableone")
## Warning: package 'tableone' was built under R version 3.5.2
Creating dfs of color schemes I may use
mycolors <- c("brown1","lightseagreen","goldenrod1","slateblue")
species_colors <- c("royalblue","salmon","darkmagenta")
pisaster_colors <- c("thistle1","plum","orchid3","darkmagenta")
RMSE_colors <- c("slategray1","steelblue3","thistle2","plum3")
pisaster_size_colors2 <- c("lightsteelblue1","skyblue","slateblue3","slateblue4")
pisaster_size_colors <- c("lavender","mediumpurple3","slateblue3","slateblue4")
Load cleaned data
#clean_data <- readRDS("./data/processed_data/processeddata.rds")
clean_data <- readRDS("../../data/processed_data/processeddata.rds")
skim(clean_data)
## Skim summary statistics
## n obs: 13165
## n variables: 23
##
## ── Variable type:factor ───────────────────────────────────────────────
## variable missing complete n n_unique
## bioregion 0 13165 13165 6
## georegion 0 13165 13165 9
## group_code 0 13165 13165 9
## group_code_UCSC_other 0 13165 13165 2
## island 0 13165 13165 6
## marine_season_code 0 13165 13165 57
## marine_site_name 0 13165 13165 77
## method_code 0 13165 13165 4
## method_code_IP_other 0 13165 13165 2
## mpa_region 0 13165 13165 6
## season_name 0 13165 13165 4
## site_code 0 13165 13165 77
## species_code 0 13165 13165 3
## state 0 13165 13165 4
## top_counts ordered
## San: 5608, Oly: 4138, Gov: 2366, Sal: 652 FALSE
## CA : 5382, CA : 2592, CA : 1660, OR: 1386 FALSE
## UCS: 9657, UCL: 2032, PBN: 400, SSS: 366 FALSE
## UCS: 9657, Oth: 3508, NA: 0 FALSE
## Mai: 12364, Bar: 279, Sad: 250, Hat: 150 FALSE
## SU1: 440, SU1: 403, SU1: 392, FA1: 376 FALSE
## Mil: 524, Sco: 513, Sti: 472, End: 468 FALSE
## IP: 12012, BT2: 782, GSE: 336, TS3: 35 FALSE
## IP: 12012, Oth: 1153, NA: 0 FALSE
## Cen: 5382, Sou: 2627, NUL: 1932, Nor: 1660 FALSE
## Sum: 4552, Fal: 4489, Spr: 4112, Win: 12 FALSE
## MCR: 524, SCT: 513, SWC: 472, END: 468 FALSE
## P.o: 11416, K FALSE
## CA: 10548, OR: 1386, WA: 865, AK: 366 FALSE
##
## ── Variable type:integer ──────────────────────────────────────────────
## variable missing complete n mean sd p0 p25
## marine_common_season 0 13165 13165 116.82 19.23 78 101
## marine_common_year 0 13165 13165 2009.7 4.81 2000 2006
## marine_sort_order 0 13165 13165 5566.24 1125.15 1720 5020
## season_sequence 0 13165 13165 2.03 0.81 1 1
## size_bin 0 13165 13165 82.44 42.2 5 50
## size_sort_order 0 13165 13165 9.57 4.15 1 6
## total 0 13165 13165 9.92 16.43 1 2
## p50 p75 p100 hist
## 118 131 152 ▅▅▆▇▇▇▇▅
## 2010 2013 2018 ▃▅▅▇▇▆▆▆
## 6090 6370 7650 ▁▁▁▂▃▃▇▁
## 2 3 4 ▇▁▇▁▁▇▁▁
## 80 110 320 ▅▇▇▃▁▁▁▁
## 10 12 33 ▅▇▇▃▁▁▁▁
## 4 11 360 ▇▁▁▁▁▁▁▁
##
## ── Variable type:numeric ──────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50
## latitude 0 13165 13165 38.67 5.2 32.87 34.73 36.62
## longitude 0 13165 13165 -122.25 2.86 -135.38 -123.98 -121.94
## p75 p100 hist
## 41.65 57.05 ▇▆▃▂▁▁▁▁
## -120.62 -117.25 ▁▁▁▁▅▇▆▃
temp_and_salt <- ggplot(WQ_clean_data, aes(x = salinity, y = water_temp, color = island_side)) + geom_jitter()
Outcome of interest is total, but really just for Pisaster.
P.ochra_clean_data <- filter(clean_data, clean_data$species_code == "P.ochraceus")
#names(P.ochra_clean_data)
P.ochra_numer_data <- P.ochra_clean_data %>% dplyr::select(total, marine_common_season, marine_common_year, marine_sort_order, season_sequence, size_bin, size_sort_order, latitude, longitude)
P.ochra_numer_data <- P.ochra_numer_data %>% mutate(season_year = marine_common_season) %>% dplyr::select(-marine_common_season)
P.ochra_numer_data <- P.ochra_numer_data %>% mutate(year = marine_common_year) %>% dplyr::select(-marine_common_year)
P.ochra_numer_data <- P.ochra_numer_data %>% mutate(season_num = season_sequence) %>% dplyr::select(-season_sequence)
P.ochra_numer_data <- P.ochra_numer_data %>% mutate(size_sort = size_sort_order) %>% dplyr::select(-size_sort_order)
P.ochra_numer_data <- P.ochra_numer_data %>% mutate(order_site = marine_sort_order) %>% dplyr::select(-marine_sort_order)
visdat::vis_dat(P.ochra_numer_data)
myPalette <- colorRampPalette(viridis_pal(begin = 0, end = 1,option = "plasma")(8),)
plasma_continuous <- scale_colour_gradientn(colours = myPalette(91328))
p1log <- P.ochra_numer_data %>% select_if(is.numeric) %>% gather(-total, key = "var", value = "value") %>%
ggplot() + geom_jitter(aes(x = value, y=log(total)), alpha = 0.3, width = 0.4)+
facet_wrap(~ var, scales = "free",nrow=4) + theme_bw()
print(p1log)
#ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/1.count_vs_numerical.png",plot = p1log, width = 5, height = 4)
colfunc_abundance <-colorRampPalette(c("plum","darkmagenta","purple4"))
title <- "log(Abundance)"
log_abundance_colours <- scale_colour_gradientn(title,colours = colfunc_abundance(50))
latitude_vs_abundance <- P.ochra_numer_data %>% ggplot(aes(y = log(total), x=latitude, col=log(total))) + geom_jitter(size=8,alpha=0.1,width=0.1) + ylab("log(Abundance)") + xlab("Latitude") + log_abundance_colours + theme(legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))
#latitude_vs_abundance
longitude_vs_abundance <- P.ochra_numer_data %>% ggplot(aes(y = log(total), x=longitude, col=log(total))) + geom_jitter(size=8,alpha=0.1,width=0.1) + ylab("log(Abundance)") + xlab("Longitude") + log_abundance_colours + theme(legend.position = "none",axis.text.y=element_text(size=12),axis.title.y=element_blank())
#longitude_vs_abundance
site_vs_abundance <- P.ochra_numer_data %>% ggplot(aes(y = log(total), x=order_site, col=log(total))) + geom_jitter(size=8,alpha=0.1,width=0.3) + ylab("log(Abundance)") + xlab("Sites (Ordered N to S)") + log_abundance_colours + theme(legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))
#site_vs_abundance
seasonyear_vs_abundance <- P.ochra_numer_data %>% ggplot(aes(y = log(total), x=season_year, col=log(total))) + geom_jitter(size=8,alpha=0.25,width=0.1) + ylab("log(Abundance)") + xlab("Season & Year") + log_abundance_colours + theme(legend.position = "none",axis.text.y=element_text(size=12),axis.title.y=element_blank())
#seasonyear_vs_abundance
sizebin_vs_abundance <- P.ochra_numer_data %>% ggplot(aes(y = log(total), x=size_bin, col=log(total))) + geom_jitter(size=8,alpha=0.25,width=0.1) + ylab("log(Abundance)") + xlab("Size Class") + log_abundance_colours + theme(legend.position = "none",axis.text.y=element_text(size=12),axis.title.y=element_blank())
#sizebin_vs_abundance
year_vs_abundance <- P.ochra_numer_data %>% ggplot(aes(y = log(total), x=year, col=log(total))) + geom_jitter(size=8,alpha=0.25,width=0.1) + ylab("log(Abundance)") + xlab("Year") + log_abundance_colours + theme(legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12))
#year_vs_abundance
grid.arrange(latitude_vs_abundance,longitude_vs_abundance,nrow=1,top="(A) Spatial Numeric Predictors")
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/1A.Spatial_Numeric.png",plot =grid.arrange(latitude_vs_abundance,longitude_vs_abundance,nrow=1,top="(A) Spatial Numeric Predictors") , width = 5, height = 4)
grid.arrange(year_vs_abundance,seasonyear_vs_abundance,nrow=1,top="(B) Temporal Numeric Predictors")
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/1B.Temporal_Numeric.png",plot=grid.arrange(year_vs_abundance,seasonyear_vs_abundance,nrow=1,top="(B) Temporal Numeric Predictors"), width = 5, height = 4)
grid.arrange(site_vs_abundance,sizebin_vs_abundance,nrow=1,top="(C) Misc. Numeric Predictors")
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/1C.Misc_Numeric.png",plot =grid.arrange(site_vs_abundance,sizebin_vs_abundance,nrow=1,top="(C) Misc. Numeric Predictors"), width = 5, height = 4)
P.ochra_numer_data_corr <- P.ochra_numer_data %>% select_if(is.numeric) %>% cor()
P.ochra_numer_data_corr <- corrplot::corrplot(P.ochra_numer_data_corr)
Okay, so a few of these are highly correlated, and for good reason. The only one I can really think to justify getting rid of one of the two correlated variables for is the size_sort/size_bin. size_bin is as informative as size_sort, and more, so I’ll get rid of size_sort
P.ochra_categ_data <- P.ochra_clean_data %>% select_if(is.factor)
P.ochra_categ_data$total <- P.ochra_clean_data$total
P.ochra_categ_data <- P.ochra_categ_data %>% dplyr::select(-species_code)
p2log <- P.ochra_categ_data %>% gather(-total, -marine_season_code, -marine_site_name, -group_code_UCSC_other, -method_code_IP_other, key = "var", value = "value") %>% ggplot() + geom_violin(aes(x = value, y=log(total), col=value)) + geom_point(aes(x = value, y=log(total), col=value)) + facet_wrap(~ var, scales = "free",nrow=3) + theme_bw() + theme(legend.position = "none") #+
## Warning: attributes are not identical across measure variables;
## they will be dropped
scale_color_manual(values=pisaster_size_colors)
## <ggproto object: Class ScaleDiscrete, Scale, gg>
## aesthetics: colour
## axis_order: function
## break_info: function
## break_positions: function
## breaks: waiver
## call: call
## clone: function
## dimension: function
## drop: TRUE
## expand: waiver
## get_breaks: function
## get_breaks_minor: function
## get_labels: function
## get_limits: function
## guide: legend
## is_discrete: function
## is_empty: function
## labels: waiver
## limits: NULL
## make_sec_title: function
## make_title: function
## map: function
## map_df: function
## n.breaks.cache: NULL
## na.translate: TRUE
## na.value: NA
## name: waiver
## palette: function
## palette.cache: NULL
## position: left
## range: <ggproto object: Class RangeDiscrete, Range, gg>
## range: NULL
## reset: function
## train: function
## super: <ggproto object: Class RangeDiscrete, Range, gg>
## reset: function
## scale_name: manual
## train: function
## train_df: function
## transform: function
## transform_df: function
## super: <ggproto object: Class ScaleDiscrete, Scale, gg>
print(p2log)
#ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/2.count_vs_categorical.png",plot = p2log, width = 5, height = 4)
#edit x= ___, xlab("_____"), axis.text.x size
bioregion_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=bioregion, col=log(total))) + geom_jitter(size=4,alpha=0.25,width=0.4,height=0.25) + xlab("Bioregion") + ylab("log(Abundance)") + theme(axis.text.x = element_text(size=8,angle=15,hjust=0.8),axis.title.x=element_text(size=12),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + log_abundance_colours
#bioregion_vs_abundance
georegion_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=georegion, col=log(total))) + geom_jitter(size=4,alpha=0.25,width=0.4,height=0.25) + xlab("Georegion") + ylab("log(Abundance)") + theme(axis.text.x = element_text(size=8,angle=15,hjust=0.8),axis.title.x=element_text(size=12),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + log_abundance_colours
#georegion_vs_abundance
groupcode_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=group_code, col=log(total))) + geom_jitter(size=4,alpha=0.25,width=0.4,height=0.25) + xlab("Surveying Group") + ylab("log(Abundance)") + theme(axis.text.x = element_text(size=9,angle=15,hjust=0.8),axis.title.x=element_text(size=12),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + log_abundance_colours
#groupcode_vs_abundance
island_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=island, col=log(total))) + geom_jitter(size=4,alpha=0.25,width=0.4,height=0.25) + xlab("Island Location?") + ylab("log(Abundance)") + theme(axis.text.x = element_text(size=10),axis.title.x=element_text(size=12),legend.position = "none",axis.title.y=element_text(size=10),axis.text.y=element_text(size=12)) + log_abundance_colours
#island_vs_abundance
method_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=method_code, col=log(total))) + geom_jitter(size=4,alpha=0.25,width=0.4,height=0.35) + xlab("Sampling Method") + ylab("log(Abundance)") + theme(axis.text.x = element_text(size=10),axis.title.x=element_text(size=12),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + log_abundance_colours
#method_vs_abundance
mpa_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=mpa_region, col=log(total))) + geom_jitter(size=4,alpha=0.2,width=0.4,height=0.25) + xlab("MPA Region") + ylab("log(Abundance)") + theme(axis.text.x = element_text(size=8,angle=15,hjust=0.8),axis.title.x=element_text(size=12),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=10)) + log_abundance_colours
#mpa_vs_abundance
season_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=season_name, col=log(total))) + geom_jitter(size=4,alpha=0.25,width=0.4,height=0.35) + xlab("Season") + ylab("log(Abundance)") + theme(axis.text.x = element_text(size=12),axis.title.x=element_text(size=12),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + log_abundance_colours
#season_vs_abundance
site_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=site_code, col=log(total))) + geom_jitter(size=4,alpha=0.25,width=0.4,height=0.35) + xlab("Site") + ylab("log(Abundance)") + theme(axis.text.x = element_blank(),axis.title.x=element_text(size=12),panel.grid.major = element_blank(), panel.grid.minor = element_blank(),legend.position = "none",axis.title.y=element_text(size=10),axis.text.y=element_text(size=12)) + log_abundance_colours
#site_vs_abundance
state_vs_abundance <- P.ochra_categ_data %>% ggplot(aes(y = log(total), x=state, col=log(total))) + geom_jitter(size=4,alpha=0.25,width=0.4,height=0.35) + xlab("State") + ylab("log(Abundance)") + theme(axis.text.x = element_text(size=10),axis.title.x=element_text(size=12),legend.position = "none",axis.title.y=element_text(size=12),axis.text.y=element_text(size=12)) + log_abundance_colours
#state_vs_abundance
grid.arrange(groupcode_vs_abundance,method_vs_abundance,season_vs_abundance,nrow=3,top="(A) Sample Info Categorical Predictors")
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/2A.SampleInfo_Cat.png",plot =grid.arrange(groupcode_vs_abundance,method_vs_abundance,season_vs_abundance,nrow=3,top="(A) Sample Info Categorical Predictors"), width = 5, height = 4)
######################################################################################################
grid.arrange(bioregion_vs_abundance,georegion_vs_abundance,state_vs_abundance,nrow=3,top="(B) Spatial Categorical Predictors Pt. I")
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/2B.Spatial_I_Cat.png",plot =grid.arrange(bioregion_vs_abundance,georegion_vs_abundance,state_vs_abundance,nrow=3,top="(B) Spatial Categorical Predictors Pt. I"), width = 5, height = 4)
######################################################################################################
grid.arrange(island_vs_abundance,mpa_vs_abundance,site_vs_abundance,nrow=3,top="(C) Spatial Categorical Predictors Pt. II")
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/2C.Spatial_II_Cat.png",plot =grid.arrange(island_vs_abundance,mpa_vs_abundance,site_vs_abundance,nrow=3,top="(C) Spatial Categorical Predictors Pt. II"), width = 5, height = 4)
Moving outcome of interest to the first column
P.ochra_clean_data_backup <- P.ochra_clean_data
#names(P.ochra_clean_data) # total is 16, remove "marine site name"(3), "season sequence"(10), size sort order (14)
P.ochra_clean_data <- P.ochra_clean_data[c(16,1,2,4,5,6,7,8,9,11,12,13,15,17,18,19,20,21,22,23)]
names(P.ochra_clean_data) # double checking
## [1] "total" "group_code"
## [3] "site_code" "marine_sort_order"
## [5] "latitude" "longitude"
## [7] "marine_common_season" "marine_season_code"
## [9] "marine_common_year" "season_name"
## [11] "method_code" "species_code"
## [13] "size_bin" "mpa_region"
## [15] "georegion" "bioregion"
## [17] "island" "group_code_UCSC_other"
## [19] "method_code_IP_other" "state"
set.seed(123)
trainset <- caret::createDataPartition(y = P.ochra_clean_data$total, p = 0.7, list = FALSE)
data_train = P.ochra_clean_data[trainset,] #extract observations/rows for training, assign to new variable
data_test = P.ochra_clean_data[-trainset,] #do the same for the test set
outcome = data_train$total
Nobs=nrow(data_train)
base_RMSE <- sqrt(sum((outcome-mean(outcome))^2)/Nobs)
print(sprintf('RMSE of baseline model is %f',base_RMSE))
## [1] "RMSE of baseline model is 17.132094"
#There is probably a nicer tidyverse way of doing this. I just couldn't think of it, so did it this way.
set.seed(1111) #makes each code block reproducible
fitControl <- caret::trainControl(method="repeatedcv",number=5,repeats=5) #setting CV method for caret
Npred <- ncol(data_train)-1 # number of predictors
resultmat <- data.frame(Variable = names(data_train)[-1], RMSE = rep(0,Npred)) #store values for RMSE for each variable
for (n in 2:ncol(data_train)) #loop over each predictor. For this to work, outcome must be in 1st column
{
fit1 <- train( as.formula(paste("total ~",names(data_train)[n])) , data = data_train, method = "lm", trControl = fitControl)
resultmat[n-1,2]= fit1$results$RMSE
}
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
resultmat
## Variable RMSE
## 1 group_code 16.71263
## 2 site_code 15.54159
## 3 marine_sort_order 16.96032
## 4 latitude 16.92755
## 5 longitude 16.89297
## 6 marine_common_season 17.05367
## 7 marine_season_code 16.82203
## 8 marine_common_year 17.06125
## 9 season_name 16.91164
## 10 method_code 16.96123
## 11 species_code 17.07703
## 12 size_bin 16.70304
## 13 mpa_region 16.49593
## 14 georegion 16.48091
## 15 bioregion 16.49852
## 16 island 16.98094
## 17 group_code_UCSC_other 16.81525
## 18 method_code_IP_other 16.93302
## 19 state 16.93342
resultmat_ordered <- resultmat %>%
mutate(Variable = factor(Variable, levels = c("site_code", "georegion", "mpa_region", "bioregion", "size_bin", "group_code", "group_code_UCSC_other", "marine_season_code", "longitude", "season_name", "latitude", "method_code_IP_other", "state", "marine_sort_order", "method_code", "island", "marine_common_season", "marine_common_year", "species_code"))) %>%
arrange(Variable)
resultmat_ordered$RMSE <- round(resultmat_ordered$RMSE,2)
Single_Predictor_RMSE <- resultmat_ordered %>% ggplot(aes(x = Variable, y=RMSE, label =RMSE)) + ylim(15.4,17.4) + geom_hline(yintercept=base_RMSE,color='red', size=2,linetype="dashed") + geom_text(stat='identity',color="black", size=5, face="bold",fill="lightgrey", angle=90) + theme(title = element_text(size=12), axis.title.y = element_text(size=14),axis.title.x = element_blank(),axis.text.y = element_text(size=12),axis.text.x = element_text(size=8,angle=35,hjust=1))
## Warning: Ignoring unknown parameters: face, fill
Single_Predictor_RMSE
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/3.Single_Predictor_RMSE.png",plot = Single_Predictor_RMSE, width = 5, height = 4)
set.seed(1111) #makes each code block reproducible
fitControl <- caret::trainControl(method="repeatedcv",number=5,repeats=5)
fit2 <- train(total ~ ., data = data_train, method = "lm", trControl = fitControl)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
fit3 <- train(total ~ ., data = data_train, method = "knn", trControl = fitControl)
fit4 <- train(total ~ ., data = data_train, method = "earth", trControl = fitControl)
## Loading required package: earth
## Warning: package 'earth' was built under R version 3.5.2
## Loading required package: Formula
## Loading required package: plotmo
## Warning: package 'plotmo' was built under R version 3.5.2
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.5.2
##
## Attaching package: 'plotrix'
## The following object is masked from 'package:scales':
##
## rescale
## Loading required package: TeachingDemos
print(sprintf('RMSE of lm/knn/mars model %f/%f/%f', fit2$results$RMSE, min(fit3$results$RMSE), min(fit4$results$RMSE) ))
## [1] "RMSE of lm/knn/mars model 15.120671/11.095066/15.157308"
library(knitr)
#install.packages("kableExtra")
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.2
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:skimr':
##
## kable
## The following object is masked from 'package:dplyr':
##
## group_rows
Model <- c("knn","lm","mars")
RMSE <- c(11.095,15.121,15.157)
Multipred_df <- data.frame(Model,RMSE)
Multipred_df
## Model RMSE
## 1 knn 11.095
## 2 lm 15.121
## 3 mars 15.157
Multipred_df_ordered <- Multipred_df %>%
mutate(Model = factor(Model, levels = c("knn","lm","mars"))) %>%
arrange(Model)
Multi_Predictor_RMSE <- Multipred_df_ordered %>% ggplot(aes(x = Model, y=RMSE, label=RMSE)) +
geom_label(stat='identity',color="black", size=5, face="bold",fill="lightgrey") + ylim(11.5,17.5) + geom_hline(yintercept=17.132,color='red', size=2,linetype="dashed") + theme(title = element_text(size=12), axis.title.y = element_text(size=14),axis.title.x = element_blank(),axis.text.y = element_text(size=12),axis.text.x = element_text(size=14))
## Warning: Ignoring unknown parameters: face
#grid.arrange(Single_Predictor_RMSE,Multi_Predictor_RMSE,nrow=2)
Multi_Predictor_RMSE
## Warning: Removed 1 rows containing missing values (geom_label).
#ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/3.B.Multi_Predictor_RMSE.png",plot = grid.arrange(Single_Predictor_RMSE,Multi_Predictor_RMSE,nrow=2), width = 5, height = 4)
#ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/3.Predictor_RMSE.png",plot = grid.arrange(Single_Predictor_RMSE,Multi_Predictor_RMSE,nrow=2), width = 5, height = 4)
Next, we noticed during our exploratory analysis that it might be useful to center and scale predictors. So let’s do that now. With caret, one can do that by providing the preProc setting inside the train function. Set it to center and scale the data, then run the 3 models from above again.
#write code that repeats the multi-predictor fits from above, but this time applies centering and scaling of variables.
#look at the RMSE for the new fits
fitControl <- trainControl(method="repeatedcv",number=5,repeats=5)
fit2a <- train(total ~ ., data = data_train, method = "lm", trControl = fitControl, preProc = c("center","scale"))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSCRE,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSCRE,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient
## fit may be misleading
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
fit3a <- train(total ~ ., data = data_train, method = "knn", trControl = fitControl, preProc = c("center","scale"))
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
fit4a <- train(total ~ ., data = data_train, method = "earth", trControl = fitControl, preProc = c("center","scale"))
## Warning in preProcess.default(method = c("center", "scale"),
## x = structure(c(0, : These variables have zero variances:
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSCRE,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: site_codeSAU, site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSOB,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: site_codeSAU,
## species_codeK.tunicata, species_codeP.ochraceus
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: species_codeK.tunicata,
## species_codeP.ochraceus
print(sprintf('RMSE of lm/knn/mars model %f/%f/%f', fit2a$results$RMSE, min(fit3a$results$RMSE), min(fit4a$results$RMSE) ))
## [1] "RMSE of lm/knn/mars model 15.156861/15.029380/15.202499"
before: “RMSE of lm/knn/mars model 15.120671/11.095066/15.157308” after: “RMSE of lm/knn/mars model 15.156861/15.029380/15.202499” RUN ON CAT RUN SINGLE
library(knitr)
library(kableExtra)
Model2 <- c("knn","knn_cs","lm","lm_cs","mars","mars_cs")
RMSE2 <- c(11.095,15.029,15.121,15.157,15.157,15.202)
Multipred_df_cs <- data.frame(Model2,RMSE2)
Multipred_df_cs
## Model2 RMSE2
## 1 knn 11.095
## 2 knn_cs 15.029
## 3 lm 15.121
## 4 lm_cs 15.157
## 5 mars 15.157
## 6 mars_cs 15.202
Multipred_df_ordered_cs <- Multipred_df_cs %>%
mutate(Model2 = factor(Model2, levels = c("knn","knn_cs","lm","lm_cs","mars","mars_cs"))) %>%
arrange(Model2)
Multi_Predictor_RMSE_cs <- Multipred_df_ordered_cs %>% ggplot(aes(x = Model2, y=RMSE2, label=RMSE2)) +
geom_text(stat='identity',color="black", size=6, face="bold",fill="lightgrey",angle=90) + ylim(10.5,17.5) + geom_hline(yintercept=base_RMSE,color='red', size=2,linetype="dashed") + theme(title = element_text(size=12), axis.title.y = element_text(size=14),axis.title.x = element_blank(),axis.text.y = element_text(size=12),axis.text.x = element_text(size=14)) + ylab("RMSE")
## Warning: Ignoring unknown parameters: face, fill
Multi_Predictor_RMSE_cs
#grid.arrange(Single_Predictor_RMSE,Multi_Predictor_RMSE,nrow=2)
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/4.Multi_Predictor_RMSE_cs.png",plot = Multi_Predictor_RMSE_cs, width = 5, height = 4)
#Use the `resamples` function in caret to extract uncertainty from the 3 models fit to the data that doesn't have predictor pre-processing, then plot it
resamps <- caret::resamples(list(LM = fit2, KNN = fit3, MARS = fit4))
bwplot(resamps, layout = c(3, 1))
#ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/5.Model_Uncertainty.png",plot = bwplot(resamps, layout = c(3, 1)), width = 5, height = 4)
#Write code to get model predictions for the outcome on the training data, and plot it as function of actual outcome values.
#also compute residuals (the difference between prediction and actual outcome) and plot that
pred <- predict(fit3,data_train)
outcome <- data_train$total
resid <- outcome - pred
df_residual <- data.frame(outcome,pred)
#plot(outcome,pred,xlim = c(0,270),ylim=c(0,250))
df_residual %>% ggplot(aes(x=outcome,y=pred)) + geom_point()
myPaletteresid <- colorRampPalette(viridis_pal(begin = 0, end = 1,option = "plasma")(8),)
title <- "Residuals"
residual_colours <- scale_colour_gradientn(title,colours = myPaletteresid(50))
#skim(resid)
Index <- c(1:7993)
residual_index <- data.frame(Index,resid)
residuals_plot <- residual_index %>% ggplot(aes(x=Index,y=resid,col=resid)) + geom_point(alpha=0.4) + residual_colours + theme(axis.title.x = element_blank()) + ylab("Residuals") + geom_hline(yintercept = 0,color='goldenrod', size=1.25,linetype="dashed")
residuals_plot
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/5.Residuals.png",plot = residuals_plot, width = 5, height = 4)
#plot(resid)
outcome = data_test$total
Nobs=nrow(data_test)
base_RMSE <- sqrt(sum((outcome-mean(outcome))^2)/Nobs)
print(sprintf('RMSE of baseline model is %f',base_RMSE))
## [1] "RMSE of baseline model is 18.028630"
#Write code that computes model predictions and for test data, then compute SSR and RMSE.
pred <- predict(fit3,data_test)
outcome <- data_test$total
SSR <- sum( (outcome - pred)^2)
RMSE = sqrt(SSR/length(outcome))
print(RMSE)
## [1] 10.13136
Data <- c("Null_train","knn_train","Null_test","knn_test")
RMSE <- c(17.132,11.095,18.029,10.131)
Final_Results <- data.frame(Data,RMSE)
Final_Results
## Data RMSE
## 1 Null_train 17.132
## 2 knn_train 11.095
## 3 Null_test 18.029
## 4 knn_test 10.131
Final_Results_ordered <- Final_Results %>%
mutate(Data = factor(Data, levels = c("Null_train","knn_train","Null_test","knn_test"))) %>%
arrange(Data)
Final_Results_ordered_plot <- Final_Results_ordered %>% ggplot(aes(x = Data, y=RMSE, label=RMSE)) +
geom_label(stat='identity',fill=RMSE_colors, size=7, face="bold") + ylim(9.5,18.5) + theme(title = element_text(size=12), axis.title.y = element_text(size=14),axis.title.x = element_blank(),axis.text.y = element_text(size=12),axis.text.x = element_text(size=14))
## Warning: Ignoring unknown parameters: face
Final_Results_ordered_plot
ggsave(filename = "../../results/Continuous_Outcome_Modeling_results/6.final_results_RMSE.png",plot = Final_Results_ordered_plot, width = 5, height = 4)